In [14]:
import pandas as pd
import seaborn as sns
from pymongo import MongoClient
import matplotlib.pyplot as plt
import sklearn.pipeline
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing
from sklearn.decomposition import PCA
In [2]:
#run:
# pip3 install -r requirements.txt
# to setup environment in terminal first
#Load credentials from .env file for CLI python
# import os
# from dotenv import load_dotenv
# load_dotenv()
# mongo_db_pw=os.getenv("MONGO_DB_PW")
# print(mongo_db_pw)
In [10]:
#For use with Jupyter Notebook
%run /home/cheerio/Rowan_Local/BigData/BigDataFP/env.py
In [11]:
red = '#ef4444'
green = '#84cc16'
orange = '#fb923c'
blue = '#22d3ee'
gray = '#475569'
sns.set_style('whitegrid')
In [4]:
# Connect to MongoDB
with MongoClient(f"mongodb+srv://cheerio:{MONGO_DB_PW}@cluster0.so7kree.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0") as client:
db = client["air_quality"]
collection = db["air_quality"]
# Fetch data as DataFrame from MongoDB
cursor = collection.find({})
data = pd.DataFrame(list(cursor))
In [5]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1379827 entries, 0 to 1379826 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 _id 1379827 non-null object 1 State Name 1379827 non-null object 2 Date 1379827 non-null datetime64[ns] 3 AQI 1379827 non-null int64 4 Category 1379827 non-null object 5 Defining Parameter 1379827 non-null object 6 Latitude 1379827 non-null float64 7 Longitude 1379827 non-null float64 8 County Name 1379827 non-null object dtypes: datetime64[ns](1), float64(2), int64(1), object(5) memory usage: 94.7+ MB
In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifer
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadranticDiscriminantAnalysis
--------------------------------------------------------------------------- ImportError Traceback (most recent call last) Cell In[15], line 9 7 from sklearn.neural_network import MLPClassifier 8 from sklearn.neighbors import KNeighborsClassifier ----> 9 from sklearn.gaussian_process import GaussianProcessClassifer 10 from sklearn.gaussian_process.kernels import RBF 11 from sklearn.tree import DecisionTreeClassifier ImportError: cannot import name 'GaussianProcessClassifer' from 'sklearn.gaussian_process' (/home/cheerio/.local/lib/python3.10/site-packages/sklearn/gaussian_process/__init__.py)
In [24]:
# Convert 'Date' field to datetime format
data['Date'] = pd.to_datetime([d for d in data['Date']]) # List comprehension for conversion
# Extract year from the 'Date' column
data['Year'] = data['Date'].dt.year
# Reshape the data for heatmap (pivot table by year and state)
data_pivoted = data.pivot_table(values='AQI', index='Year', columns='State Name', aggfunc='mean')
# Generate heatmap using seaborn
sns.heatmap(data_pivoted)
plt.show()
In [27]:
unique_states = data['State Name'].unique()
# Print all unique state names
print("Unique State Names:")
for state in unique_states:
print(state)
Unique State Names: Alabama Alaska Arizona Arkansas California Colorado Connecticut Country Of Mexico Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota Mississippi Missouri Montana Nebraska Nevada New Hampshire New Jersey New Mexico New York North Carolina North Dakota Ohio Oklahoma Oregon Pennsylvania Rhode Island South Carolina Tennessee Texas Utah Virginia Washington West Virginia Wisconsin Wyoming Delaware District Of Columbia Kansas Puerto Rico South Dakota Vermont Virgin Islands
In [28]:
# Preprocess data for Heatmap
# Extract year from the Date column
plt.figure(figsize=(20, 10)) #Increase figure size
data['Year'] = pd.to_datetime(data['Date']).dt.year
# Calculate average AQI every year for each State
data_pivot = data.pivot_table(values='AQI', index='State Name', columns='Year', aggfunc='mean')
# Create the Heatmap
sns.heatmap(data_pivot, cmap="YlGnBu") # Adjust the colormap as desired
plt.xlabel("Year")
plt.ylabel("State/Territory")
plt.title("Heatmap of Average AQI by State and Year")
# Show all state names on the y-axis labels
plt.tight_layout() # Adjust layout to prevent overlapping labels
plt.show()
In [20]:
#GeoJSON download from https://eric.clst.org/tech/usgeojson/
import folium
# Function to generate a choropleth map for a specific year
def create_aqi_map(data_year):
# Filter data for the specific year
data_filtered = data[data['Date'].dt.year == data_year]
# Calculate average AQI every state for the year
data_pivot = data_filtered.pivot_table(values='AQI', index='State Name', aggfunc='mean')
data_pivot = data_pivot.reset_index() # Convert to DataFrame for choropleth
# Create a dictionary to map state names to latitude/longitude (if not already available)
# ... (code to create state_coords dictionary if needed)
# Create a base Folium map (adjust center coordinates and zoom level as desired)
m = folium.Map(location=[37.09024, -95.712891], zoom_start=4)
# Read the GeoJSON data
state_data = folium.GeoJson(data=open('/home/cheerio/Rowan_Local/BigData/BigDataFP/gz_2010_us_040_00_500k.json').read())
# Function to assign color based on AQI value (replace thresholds and colors as needed)
def color_mapper(avg_aqi):
if avg_aqi <= 50:
return 'green'
elif avg_aqi <= 100:
return 'yellow'
else:
return 'red'
# Add GeoJSON data with color styling based on average AQI
folium.Choropleth(
geo_data=state_data,
data=data_pivot,
locations=lambda x: state_coords.get(x['State Name']), # Lookup by state name (handle missing values)
columns=['Average AQI'],
fill_color=lambda x: color_mapper(x['Average AQI']),
fill_opacity=0.7,
line_color='white',
line_weight=1
).add_to(m)
# Add legend (optional)
m.add_legend(title='Average AQI')
# Display the map (consider saving to image or using Panel)
m
# Generate maps for each year (replace 2019 with your actual years)
for year in range(2017, 2021):
create_aqi_map(year)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[20], line 51 49 # Generate maps for each year (replace 2019 with your actual years) 50 for year in range(2017, 2021): ---> 51 create_aqi_map(year) Cell In[20], line 32, in create_aqi_map(data_year) 29 return 'red' 31 # Add GeoJSON data with color styling based on average AQI ---> 32 folium.Choropleth( 33 geo_data=state_data, 34 data=data_pivot, 35 locations=lambda x: state_coords.get(x['State Name']), # Lookup by state name (handle missing values) 36 columns=['Average AQI'], 37 fill_color=lambda x: color_mapper(x['Average AQI']), 38 fill_opacity=0.7, 39 line_color='white', 40 line_weight=1 41 ).add_to(m) 43 # Add legend (optional) 44 m.add_legend(title='Average AQI') File ~/.local/lib/python3.10/site-packages/folium/features.py:1508, in Choropleth.__init__(self, geo_data, data, columns, key_on, bins, fill_color, nan_fill_color, fill_opacity, nan_fill_opacity, line_color, line_weight, line_opacity, name, legend_name, overlay, control, show, topojson, smooth_factor, highlight, use_jenks, **kwargs) 1504 self._name = "Choropleth" 1506 fill_color = fill_color or ("blue" if data is None else "Blues") -> 1508 if data is not None and not color_brewer(fill_color): 1509 raise ValueError( 1510 "Please pass a valid color brewer code to " 1511 "fill_local. See docstring for valid codes." 1512 ) 1514 if nan_fill_opacity is None: File ~/.local/lib/python3.10/site-packages/branca/utilities.py:129, in color_brewer(color_code, n) 123 raise ValueError( 124 "The minimum number of colors in a" 125 " ColorBrewer sequential color series is 3", 126 ) 128 if not isinstance(color_code, str): --> 129 raise ValueError(f"color should be a string, not a {type(color_code)}.") 130 if color_code[-2:] == "_r": 131 base_code = color_code[:-2] ValueError: color should be a string, not a <class 'function'>.
In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
# Define the feature (AQI) and target variable (AQI category)
features = ['AQI']
target = 'Category' # Assuming 'Category' column contains labels like good, moderate, unhealthy
# Encode categorical target variable (if needed)
le = LabelEncoder()
data[target] = le.fit_transform(data[target])
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2, random_state=42)
# Create the SVC model
model = SVC(kernel='linear') # You can try different kernels (e.g., 'rbf')
# Train the model
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Evaluate the model performance (e.g., accuracy score, confusion matrix)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")
# Use the trained model to predict categories for new data points
new_data = pd.DataFrame({'AQI': [10, 150, 220]}) # Example data
new_data_predictions = model.predict(new_data)
predicted_categories = le.inverse_transform(new_data_predictions)
print(f"Predicted categories for new data: {predicted_categories}")
Model Accuracy: 1.0 Predicted categories for new data: ['Good' 'Unhealthy for Sensitive Groups' 'Very Unhealthy']
In [25]:
# Function to create a pairplot for a specific state
def state_pairplot(state_data):
# Filter data for the specific state
data_filtered = state_data[state_data['State Name'] == state]
# Create a pairplot with AQI as hue variable (colors by AQI value)
sns.pairplot(data_filtered, hue='AQI', height=3, diag_kind='hist') # Updated call
# Add a title with state name
plt.suptitle(f"AQI vs. Latitude & Longitude by Year for {state}", fontsize=14)
plt.show()
# Group data by state (assuming 'State Name' is the grouping variable)
state_groups = data.groupby('State Name')
# Iterate through each state group
for state, state_data in state_groups:
state_pairplot(state_data.copy()) # Pass a copy to avoid modifying original data
In [ ]: